package xiaoa.java.spider.wx;

import java.io.File;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;

import java.util.AbstractMap.SimpleEntry;


/**
 * Scrapes WeChat public-account listings from www.sovxin.com and appends the
 * results to a local text file.
 *
 * <p>Each record is one line of the form {@code category^accountName^accountId}
 * (or {@code category^ error} when the account detail page could not be read).
 * The file is always written and read as UTF-8 so that the "already fetched"
 * check in {@link #main(String[])} sees the same category names that were written.
 *
 * @author xiaoa
 * @date 2017-09-07 22:11:12
 * @version V1.0
 */
public class FetchWxPublicSigna {

	/** Site root, without a trailing slash. */
	private static final String BASE_URL = "http://www.sovxin.com";

	/** Connect/read timeout handed to jsoup for every page fetch. */
	private static final int TIMEOUT_MILLIS = (int) TimeUnit.MINUTES.toMillis(5);

	/** File that collected records are appended to. */
	private static final File OUTPUT_FILE = new File("e:/wxUserInfo.txt");

	/** Charset used for both writing and reading {@link #OUTPUT_FILE}. */
	private static final Charset UTF_8 = Charset.forName("UTF-8");

	/** Serialises appends to {@link #OUTPUT_FILE}; many pool threads write to it. */
	private static final Object FILE_LOCK = new Object();

	/**
	 * Maximum number of times one category is attempted (first try plus ten retries).
	 */
	private static final int MAX_ATTEMPTS = 11;

	/**
	 * Worker pool used by {@link #main(String[])}: 35 threads and a bounded queue
	 * of 1000 pending tasks.
	 */
	static ThreadPoolExecutor  pool = new  ThreadPoolExecutor(35, 35, 20L, TimeUnit.MINUTES, new ArrayBlockingQueue<Runnable>(1000));


	/**
	 * Fetches the category list from the site's home page.
	 *
	 * @return entries whose key is the category name and whose value is the
	 *         {@code href} of the category page exactly as it appears in the markup
	 * @throws Throwable if the home page cannot be downloaded or parsed
	 * @author xiaoa
	 */
	public static List<Map.Entry<String, String>>  fetchFL()throws Throwable{

		List<Map.Entry<String, String>> categories = new ArrayList<>();

		Document document = Jsoup.parse(new URL("http://www.sovxin.com/"), TIMEOUT_MILLIS);

		// Category links: the anchor text is the category name.
		Elements categoryLinks = document.select("._i_fenlei").select("._if_title").select("a");
		for (Element link : categoryLinks){
			categories.add(new SimpleEntry<>(link.text(), link.attr("href")));
		}

		// Entries under the "._i_gedi" block (presumably the per-region listing - confirm);
		// here the name comes from the title attribute instead of the text.
		Elements regionLines = document.select("._i_gedi").select("._ig_line");
		for (Element line : regionLines){
			Elements nameLink = line.select("._ig_name");
			categories.add(new SimpleEntry<>(nameLink.attr("title"), nameLink.attr("href")));
		}

		return categories;
	}


	/**
	 * Walks the paginated account list of one category and appends a record for
	 * every account found to the output file.
	 *
	 * <p>Page URLs are derived from {@code url} by inserting {@code _<page>} before
	 * {@code .html}, starting at page 1; iteration stops at the first page that
	 * lists no accounts. A failure to read a single account detail page is logged
	 * and recorded as {@code " error"} rather than aborting the category.
	 *
	 * @param url category page URL; must contain {@code ".html"}
	 * @param key category name, written as the first field of every record
	 * @return the records appended to the file, in order, without the trailing newline
	 * @throws IllegalArgumentException if {@code url} does not contain {@code ".html"}
	 * @throws Throwable if a list page cannot be downloaded or the file cannot be written
	 * @author xiaoa
	 */
	public static List<String> fetchAccount(String url , String key)throws Throwable{

		int htmlIndex = url.indexOf(".html");
		if (htmlIndex < 0){
			throw new IllegalArgumentException("Category url does not contain \".html\": " + url);
		}
		String pagePrefix = url.substring(0, htmlIndex);

		List<String> records = new ArrayList<>();

		for (int page = 1; ; page++){
			String pageUrl = pagePrefix + "_" + page + ".html";
			System.out.print(pageUrl);

			Document document = Jsoup.parse(new URL(pageUrl), TIMEOUT_MILLIS);

			Elements accountLinks = document.select(".all").select("._list").select("._box").select("p").select("a");

			for (Element link : accountLinks){
				String detailUrl = BASE_URL + link.attr("href");

				String uInfo = " error";
				try {
					uInfo = fetchUserInfo(detailUrl);
				} catch (Exception e1) {
					// Best effort: keep going, but leave a trace of what failed.
					System.err.println("fetchUserInfo failed for " + detailUrl + ": " + e1);
				}

				String record = key + "^" + uInfo;
				appendRecord(record + "\n");
				records.add(record);
			}

			System.out.println(" size = " + accountLinks.size());

			// An empty page marks the end of the pagination.
			if (accountLinks.isEmpty()){
				System.out.println("break " + url);
				break;
			}
		}

		return records;
	}


	/**
	 * Appends {@code text} to the output file as UTF-8, one writer at a time.
	 */
	private static void appendRecord(String text) throws java.io.IOException {
		synchronized (FILE_LOCK){
			FileUtils.writeStringToFile(OUTPUT_FILE, text, UTF_8, true);
		}
	}


	/**
	 * Reads the account name and account id from an account detail page.
	 *
	 * @param url absolute URL of the account detail page
	 * @return {@code accountName^accountId}
	 * @throws IllegalStateException if the page lacks the expected {@code .name_view} table cells
	 * @throws Throwable if the page cannot be downloaded or parsed
	 * @author xiaoa
	 */
	public static String fetchUserInfo(String url)throws Throwable{

		Document document = Jsoup.parse(new URL(url), TIMEOUT_MILLIS);

		// The name is read from the element immediately preceding "._wxgzzh".
		String userName = document.select("._wxgzzh").prev().text();

		// The id is read from the first cell of the second row of the ".name_view" table.
		Elements rows = document.select(".name_view").select("tr");
		if (rows.size() < 2){
			throw new IllegalStateException("Unexpected page layout (no second .name_view row): " + url);
		}
		Elements cells = rows.get(1).select("td");
		if (cells.isEmpty()){
			throw new IllegalStateException("Unexpected page layout (no .name_view cell): " + url);
		}

		String userAc = cells.get(0).text().replace("微信号：", "");

		return userName  + "^" + userAc;
	}


	/**
	 * Runs {@link #fetchAccount(String, String)} for one category, retrying on any
	 * failure up to {@link #MAX_ATTEMPTS} attempts in total. Failures are printed
	 * and never propagate to the pool.
	 *
	 * <p>NOTE: each retry restarts at page 1, so records written before the failure
	 * are appended again.
	 */
	private static void fetchCategoryWithRetry(Map.Entry<String, String> category){
		for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++){
			try {
				System.out.println("==============" + JSON.toJSONString(category));
				fetchAccount(BASE_URL + "/" + category.getValue(), category.getKey());
				return;
			} catch (Throwable e) {
				e.printStackTrace();
			}
		}
	}


	/**
	 * Entry point: fetches every category not yet present in the output file,
	 * one pool task per category.
	 *
	 * <p>A category counts as already fetched when any existing record starts with
	 * its name. The pool is shut down once all tasks are queued so the JVM can exit
	 * when the work is finished.
	 *
	 * @param args unused
	 * @throws Throwable if the category list or the final sample page cannot be fetched
	 */
	public static void main(String[] args) throws Throwable{

		// Categories that already have at least one record in the output file.
		Set<String> fetched = new HashSet<>();
		if (OUTPUT_FILE.exists()){
			for (String line : FileUtils.readLines(OUTPUT_FILE, UTF_8)){
				fetched.add(line.split("\\^")[0]);
			}
		}

		List<Map.Entry<String, String>> flList = fetchFL();

		try {
			for (int i = flList.size() - 1; i >= 0; i--){

				final Map.Entry<String, String> fl = flList.get(i);

				if (fetched.contains(fl.getKey())){
					System.out.println("========== continue " + JSON.toJSONString(fl));
					continue;
				}

				pool.execute(new Runnable() {
					@Override
					public void run() {
						fetchCategoryWithRetry(fl);
					}
				});
			}
		} finally {
			// Queued tasks still run; without this the core threads keep the JVM alive forever.
			pool.shutdown();
		}

		System.out.println(JSON.toJSONString(fetchUserInfo("http://www.sovxin.com/weixin_118696.html")));
	}

}
